kmer_analysis
Load packages
First we will load in the packages needed to execute this analysis
library(dplyr)
library(readr)
library(tidyr)
library(stringr)
library(magrittr)
library(ggplot2)
library(viridis)
library(ggfortify)
library(PCAtools)
library(edgeR)
library(AnnotationHub)
library(ensembldb)
library(pheatmap)
library(cowplot)
library(ggbeeswarm)
library(tibble)
library(EnsDb.Hsapiens.v86)
library(GenomicFeatures)
library(here)Load Custom Functions
Some methods will be called “hisat2”, this just means that bootstrapping estimates have not been taken into account. Nothing super special going on.
I have also defined several functions here for quality of life purposes
# From 01_Annotations
gene_txp_anno <- read_csv(here("data/annotations/grch38_103_df.csv.gz"))
transcript_key <- gene_txp_anno %>%
dplyr::filter(type == "transcript") %>%
dplyr::select(transcript_id,
transcript_name)
txp_gene_ensdb_lengths <- read_csv(
here("data/annotations/txp_gene_ensdb_lengths.csv.gz")
)
transcript_key <- dplyr::select(txp_gene_ensdb_lengths,
c("transcript_id" = tx_id,
"transcript_name" = tx_name))
# From 02_dtelist_prep
hisat2_dtelist <- read_rds(here("data/dtelists/hisat2_dtelist.rds"))
kallisto_dtelist <- read_rds(here("data/dtelists/kallisto_dtelist.rds"))
sa_dtelist <- read_rds(here("data/dtelists/sa_dtelist.rds"))
star_dtelist <- read_rds(here("data/dtelists/star_dtelist.rds"))
# From 04_principal_component_analysis
hisat2_loadings <- read_csv(
here("data/pca/loadings/hisat2_loadings.csv")
) %>%
head(250)
kallisto_loadings <- read_csv(
here("data/pca/loadings/kallisto_loadings.csv")
) %>%
head(275)
salmon_loadings <- read_csv(
here("data/pca/loadings/salmon_loadings.csv")
) %>%
head(300)
star_loadings <- read_csv(
here("data/pca/loadings/star_loadings.csv")
) %>%
head(275)
# From figure2 make
hisat2_unique <- read_csv(here("data/unique_transcripts/hisat2_unique.csv.gz"))
kallisto_unique <- read_csv(
here("data/unique_transcripts/kallisto_unique.csv.gz")
)
salmon_unique <- read_csv(here("data/unique_transcripts/salmon_unique.csv.gz"))
star_unique <- read_csv(here("data/unique_transcripts/star_unique.csv.gz"))Get Combined Data
joined_df <- get_joined_samples(w = hisat2_dtelist,
x = kallisto_dtelist,
y = star_dtelist,
z = sa_dtelist,
method_w = "HISAT2",
method_x = "Kallisto",
method_y = "STAR",
method_z = "Salmon",
sample = 1)
cor_df <- joined_df
cor_df$variance <- apply(cor_df[,-1], 1, var)lowcor_ensdb <- cor_df %>%
dplyr::arrange(desc(variance)) %>%
head(250) %>%
dplyr::select(transcript_id) %>%
dplyr::left_join(gene_txp_anno,
by = "transcript_id")
lowcor_ensdb <- lowcor_ensdb %>%
dplyr::select(-score,
-phase) %>%
select_if(~ !all(is.na(.))) %>%
dplyr::select(-ccds_id,
-transcript_support_level,
-tag) %>%
drop_na()highcor_ensdb <- cor_df %>%
dplyr::arrange(variance) %>%
head(250) %>%
dplyr::select(transcript_id) %>%
dplyr::left_join(gene_txp_anno,
by = "transcript_id")
highcor_ensdb <- highcor_ensdb %>%
dplyr::select(-score,
-phase) %>%
select_if(~ !all(is.na(.))) %>%
dplyr::select(-ccds_id,
-transcript_support_level,
-tag) %>%
drop_na()Low Correlation Transcripts K-mer Analysis
yTx_lowcor <- exonsBy(edb, filter = TxIdFilter(lowcor_ensdb$transcript_id))
yTxSeqs_lowcor <- extractTranscriptSeqs(dna, yTx_lowcor)
lowcor_df <- yTxSeqs_lowcor %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
lowcor_df %>%
head(n = 200) %>%
write_delim(here("data/kmer_analysis/correlation/lowcor/lowcor_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/correlation
cp ${loc}/lowcor/lowcor_seqs.txt ${loc}/lowcor/lowcor_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/lowcor/lowcor_mer_counts.jf -C ${loc}/lowcor/lowcor_seqs.fa
jellyfish histo ${loc}/lowcor/lowcor_mer_counts.jf > ${loc}/lowcor/histos/lowcor_${i}_mer_counts_histo.txt
jellyfish stats ${loc}/lowcor/lowcor_mer_counts.jf > ${loc}/lowcor/stats/lowcor_${i}_mer_counts_stats.txt
echo "lowcor ${i}_mer done"
doneKmer data
lowcor_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% lowcor_ensdb$transcript_id)
lowcor_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 180 1344 2475 3287 4078 27019
lowcor_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/correlation/lowcor/histos"))) {
lowcor_mer_add <- read_delim(
paste0(here("data/kmer_analysis/correlation/lowcor/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
lowcor_mer_hist <- rbind(lowcor_mer_hist, lowcor_mer_add)
}
lowcor_mer_hist <- lowcor_mer_hist %>%
mutate(group = "lowcor")
write_csv(lowcor_mer_hist,
here("data/kmer_analysis/correlation/lowcor/lowcor_mer_hist.csv"))
lowcor_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/correlation/lowcor/stats"))) {
lowcor_stat_add <- read_delim(
paste0(here("data/kmer_analysis/correlation/lowcor/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
lowcor_mer_stat <- rbind(lowcor_mer_stat, lowcor_stat_add)
}
lowcor_mer_stat <- lowcor_mer_stat %>%
mutate(group = "lowcor")
write_csv(lowcor_mer_stat,
here("data/kmer_analysis/correlation/lowcor/lowcorcor_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
lowcor_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% lowcor_ensdb$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "lowcor")
lowcor_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 180 1344 2475 3287 4078 27019
## . Freq
## 1 0 15
## 2 1 1
## 3 2 1
## 4 3 7
## 5 4 136
## 6 5 443
## 7 6 601
## 8 7 284
## 9 8 149
## 10 9 100
## 11 10 150
## 12 11 112
## 13 12 94
## 14 13 78
## 15 14 70
## 16 15 68
## 17 16 58
## 18 17 62
## 19 18 60
## 20 19 58
## 21 20 56
## 22 21 54
## 23 22 58
## 24 23 56
## 25 24 54
## 26 25 54
## 27 26 52
## 28 27 50
## 29 28 52
## 30 29 52
## 31 30 50
## 32 31 50
## 33 32 46
## 34 33 42
## 35 34 42
## 36 35 40
## 37 36 38
## 38 37 34
## 39 38 34
## 40 39 34
## 41 40 30
## 42 41 30
## 43 42 30
## 44 43 28
## 45 44 28
## 46 45 28
## 47 46 30
## 48 47 30
## 49 48 30
## 50 49 28
## 51 50 28
## 52 51 26
## 53 52 26
## 54 53 26
## 55 54 26
## 56 55 26
## 57 56 26
## 58 57 26
## 59 58 28
## 60 59 28
## 61 60 28
## 62 61 28
## 63 62 28
## 64 63 28
## 65 64 28
## 66 65 28
## 67 66 28
## 68 67 28
## 69 68 30
## 70 69 30
## 71 70 30
## 72 71 30
## 73 72 30
## 74 73 30
## 75 74 30
## 76 75 30
## 77 76 30
## 78 77 30
## 79 78 32
## 80 79 32
## 81 80 32
## 82 81 32
## 83 82 32
## 84 83 32
## 85 84 32
## 86 85 32
## 87 86 32
## 88 87 32
## 89 88 32
## 90 89 32
## 91 90 30
## 92 91 30
## 93 92 30
## 94 93 30
## 95 94 30
## 96 95 30
## 97 96 30
## 98 97 30
## 99 98 30
## 100 99 30
## 101 100 15
lowcor_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))lowcor_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")lowcor_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()High Correlation Transcripts K-mer Analysis
yTx_highcor <- exonsBy(edb, filter = TxIdFilter(highcor_ensdb$transcript_id))
yTxSeqs_highcor <- extractTranscriptSeqs(dna, yTx_highcor)
highcor_df <- yTxSeqs_highcor %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
highcor_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/correlation/highcor/highcor_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/correlation
cp ${loc}/highcor/highcor_seqs.txt ${loc}/highcor/highcor_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/highcor/highcor_mer_counts.jf -C ${loc}/highcor/highcor_seqs.fa
jellyfish histo ${loc}/highcor/highcor_mer_counts.jf > ${loc}/highcor/histos/highcor_${i}_mer_counts_histo.txt
jellyfish stats ${loc}/highcor/highcor_mer_counts.jf > ${loc}/highcor/stats/highcor_${i}_mer_counts_stats.txt
echo "highcor ${i}_mer done"
doneKmer data
highcor_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/correlation/highcor/histos"))) {
highcor_mer_add <- read_delim(
paste0(here("data/kmer_analysis/correlation/highcor/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
highcor_mer_hist <- rbind(highcor_mer_hist, highcor_mer_add)
}
highcor_mer_hist <- highcor_mer_hist %>%
mutate(group = "highcor")
write_csv(highcor_mer_hist,
here("data/kmer_analysis/correlation/highcor/highcor_mer_hist.csv"))
highcor_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/correlation/highcor/stats"))) {
highcor_stat_add <- read_delim(
paste0(here("data/kmer_analysis/correlation/highcor/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
highcor_mer_stat <- rbind(highcor_mer_stat, highcor_stat_add)
}
highcor_mer_stat <- highcor_mer_stat %>%
mutate(group = "highcor")
write_csv(highcor_mer_stat,
here("data/kmer_analysis/correlation/highcor/highcor_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
highcor_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% highcor_ensdb$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "highcor")
highcor_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 527 2125 3423 4057 4876 20635
## . Freq
## 1 0 2
## 2 1 1
## 3 2 1
## 4 3 3
## 5 4 130
## 6 5 469
## 7 6 757
## 8 7 357
## 9 8 170
## 10 9 99
## 11 10 138
## 12 11 110
## 13 12 92
## 14 13 80
## 15 14 70
## 16 15 68
## 17 16 62
## 18 17 52
## 19 18 50
## 20 19 48
## 21 20 46
## 22 21 44
## 23 22 40
## 24 23 38
## 25 24 34
## 26 25 34
## 27 26 30
## 28 27 30
## 29 28 26
## 30 29 26
## 31 30 22
## 32 31 22
## 33 32 22
## 34 33 22
## 35 34 20
## 36 35 22
## 37 36 20
## 38 37 18
## 39 38 16
## 40 39 16
## 41 40 14
## 42 41 16
## 43 42 14
## 44 43 14
## 45 44 14
## 46 45 14
## 47 46 14
## 48 47 14
## 49 48 14
## 50 49 14
## 51 50 12
## 52 51 14
## 53 52 12
## 54 53 12
## 55 54 10
## 56 55 10
## 57 56 10
## 58 57 10
## 59 58 10
## 60 59 10
## 61 60 10
## 62 61 10
## 63 62 8
## 64 63 8
## 65 64 8
## 66 65 6
## 67 66 6
## 68 67 6
## 69 68 6
## 70 69 6
## 71 70 6
## 72 71 6
## 73 72 6
## 74 73 6
## 75 74 6
## 76 75 6
## 77 76 6
## 78 77 6
## 79 78 6
## 80 79 6
## 81 80 6
## 82 81 6
## 83 82 6
## 84 83 6
## 85 84 6
## 86 85 6
## 87 86 6
## 88 87 6
## 89 88 6
## 90 89 6
## 91 90 6
## 92 91 6
## 93 92 6
## 94 93 6
## 95 94 6
## 96 95 6
## 97 96 6
## 98 97 4
## 99 98 4
## 100 99 4
## 101 100 2
highcor_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))highcor_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")highcor_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()K-mer Analysis by PCA Loadings
HISAT2
hs2_tx_remove <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% hisat2_loadings$transcript_id) %>%
dplyr::filter(str_detect(seqnames, "^CHR")) %>%
as.data.frame()
hisat2_loadings <- hisat2_loadings %>%
dplyr::filter(!transcript_id %in% hs2_tx_remove$tx_id)
yTx_hisat2 <- exonsBy(edb,
filter = TxIdFilter(hisat2_loadings$transcript_id))
yTxSeqs_hisat2 <- extractTranscriptSeqs(dna, yTx_hisat2)
hisat2_kmer_df <- yTxSeqs_hisat2 %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
hisat2_kmer_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/loadings/hisat2/hisat2_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/loadings
cp ${loc}/hisat2/hisat2_seqs.txt ${loc}/hisat2/hisat2_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/hisat2/hisat2_mer_counts.jf -C ${loc}/hisat2/hisat2_seqs.fa
jellyfish histo ${loc}/hisat2/hisat2_mer_counts.jf > ${loc}/hisat2/histos/hisat2_${loc}_mer_counts_histo.txt
jellyfish stats ${loc}/hisat2/hisat2_mer_counts.jf > ${loc}/hisat2/stats/hisat2_${i}_mer_counts_stats.txt
echo "hisat2 ${i}_mer done"
doneKmer data
hisat2_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/hisat2/histos"))) {
hisat2_mer_add <- read_delim(
paste0(here("data/kmer_analysis/loadings/hisat2/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
hisat2_mer_hist <- rbind(hisat2_mer_hist, hisat2_mer_add)
}
hisat2_mer_hist <- hisat2_mer_hist %>%
mutate(group = "hisat2")
write_csv(hisat2_mer_hist,
here("data/kmer_analysis/loadings/hisat2/hisat2_mer_hist.csv"))
hisat2_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/hisat2/stats"))) {
hisat2_stat_add <- read_delim(
paste0(here("data/kmer_analysis/loadings/hisat2/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
hisat2_mer_stat <- rbind(hisat2_mer_stat, hisat2_stat_add)
}
hisat2_mer_stat <- hisat2_mer_stat %>%
mutate(group = "hisat2")
write_csv(hisat2_mer_stat,
here("data/kmer_analysis/loadings/hisat2/hisat2_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
hisat2_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% hisat2_loadings$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "hisat2")
hisat2_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 331 1326 2250 2976 3838 18760
## . Freq
## 1 0 15
## 2 1 1
## 3 2 1
## 4 3 7
## 5 4 134
## 6 5 425
## 7 6 551
## 8 7 263
## 9 8 144
## 10 9 89
## 11 10 70
## 12 11 54
## 13 12 48
## 14 13 38
## 15 14 34
## 16 15 30
## 17 16 24
## 18 17 24
## 19 18 23
## 20 19 23
## 21 20 23
## 22 21 23
## 23 22 23
## 24 23 23
## 25 24 22
## 26 25 22
## 27 26 22
## 28 27 21
## 29 28 22
## 30 29 22
## 31 30 22
## 32 31 22
## 33 32 20
## 34 33 19
## 35 34 19
## 36 35 18
## 37 36 17
## 38 37 15
## 39 38 15
## 40 39 15
## 41 40 13
## 42 41 13
## 43 42 13
## 44 43 12
## 45 44 12
## 46 45 12
## 47 46 13
## 48 47 13
## 49 48 13
## 50 49 12
## 51 50 12
## 52 51 12
## 53 52 12
## 54 53 12
## 55 54 12
## 56 55 12
## 57 56 12
## 58 57 12
## 59 58 12
## 60 59 12
## 61 60 12
## 62 61 12
## 63 62 12
## 64 63 12
## 65 64 12
## 66 65 13
## 67 66 13
## 68 67 13
## 69 68 14
## 70 69 14
## 71 70 14
## 72 71 14
## 73 72 14
## 74 73 14
## 75 74 14
## 76 75 14
## 77 76 14
## 78 77 14
## 79 78 15
## 80 79 15
## 81 80 15
## 82 81 15
## 83 82 15
## 84 83 15
## 85 84 15
## 86 85 15
## 87 86 15
## 88 87 15
## 89 88 15
## 90 89 15
## 91 90 15
## 92 91 15
## 93 92 15
## 94 93 15
## 95 94 15
## 96 95 15
## 97 96 15
## 98 97 15
## 99 98 15
## 100 99 15
hisat2_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))hisat2_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")hisat2_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()Kallisto
k_tx_remove <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% kallisto_loadings$transcript_id) %>%
dplyr::filter(str_detect(seqnames, "^CHR")) %>%
as.data.frame()
kallisto_loadings <- kallisto_loadings %>%
dplyr::filter(transcript_id %in% gene_txp_anno$transcript_id)
yTx_kallisto <- exonsBy(edb,
filter = TxIdFilter(kallisto_loadings$transcript_id))
yTxSeqs_kallisto <- extractTranscriptSeqs(dna, yTx_kallisto)
kallisto_kmer_df <- yTxSeqs_kallisto %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
kallisto_kmer_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/loadings/kallisto/kallisto_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/loadings
cp ${loc}/kallisto/kallisto_seqs.txt ${loc}/kallisto/kallisto_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/kallisto/kallisto_mer_counts.jf -C ${loc}/kallisto/kallisto_seqs.fa
jellyfish histo ${loc}/kallisto/kallisto_mer_counts.jf > ${loc}/kallisto/histos/kallisto_${i}_mer_counts_histo.txt
jellyfish stats ${loc}/kallisto/kallisto_mer_counts.jf > ${loc}/kallisto/stats/kallisto_${i}_mer_counts_stats.txt
echo "kallisto ${i}_mer done"
doneKmer data
kallisto_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/kallisto/histos"))) {
kallisto_mer_add <- read_delim(
paste0(here("data/kmer_analysis/loadings/kallisto/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
kallisto_mer_hist <- rbind(kallisto_mer_hist, kallisto_mer_add)
}
kallisto_mer_hist <- kallisto_mer_hist %>%
mutate(group = "kallisto")
write_csv(kallisto_mer_hist,
here("data/kmer_analysis/loadings/kallisto/kallisto_mer_hist.csv"))
kallisto_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/kallisto/stats"))) {
kallisto_stat_add <- read_delim(
paste0(here("data/kmer_analysis/loadings/kallisto/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
kallisto_mer_stat <- rbind(kallisto_mer_stat, kallisto_stat_add)
}
kallisto_mer_stat <- kallisto_mer_stat %>%
mutate(group = "kallisto")
write_csv(kallisto_mer_stat,
here("data/kmer_analysis/loadings/kallisto/kallisto_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
kallisto_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% kallisto_loadings$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "kallisto")
kallisto_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 164 1344 2540 3944 3912 205012
## . Freq
## 1 0 24
## 2 1 1
## 3 2 1
## 4 3 5
## 5 4 136
## 6 5 459
## 7 6 631
## 8 7 327
## 9 8 200
## 10 9 137
## 11 10 101
## 12 11 78
## 13 12 66
## 14 13 55
## 15 14 49
## 16 15 47
## 17 16 44
## 18 17 40
## 19 18 42
## 20 19 42
## 21 20 40
## 22 21 40
## 23 22 37
## 24 23 37
## 25 24 35
## 26 25 35
## 27 26 35
## 28 27 34
## 29 28 35
## 30 29 34
## 31 30 34
## 32 31 34
## 33 32 30
## 34 33 29
## 35 34 28
## 36 35 27
## 37 36 27
## 38 37 26
## 39 38 25
## 40 39 25
## 41 40 23
## 42 41 23
## 43 42 22
## 44 43 21
## 45 44 21
## 46 45 21
## 47 46 22
## 48 47 22
## 49 48 22
## 50 49 21
## 51 50 21
## 52 51 21
## 53 52 21
## 54 53 21
## 55 54 21
## 56 55 21
## 57 56 22
## 58 57 22
## 59 58 22
## 60 59 22
## 61 60 22
## 62 61 22
## 63 62 22
## 64 63 21
## 65 64 21
## 66 65 21
## 67 66 21
## 68 67 21
## 69 68 21
## 70 69 19
## 71 70 20
## 72 71 20
## 73 72 20
## 74 73 20
## 75 74 20
## 76 75 20
## 77 76 20
## 78 77 20
## 79 78 21
## 80 79 21
## 81 80 21
## 82 81 21
## 83 82 22
## 84 83 22
## 85 84 22
## 86 85 22
## 87 86 21
## 88 87 22
## 89 88 22
## 90 89 22
## 91 90 22
## 92 91 22
## 93 92 22
## 94 93 22
## 95 94 22
## 96 95 23
## 97 96 23
## 98 97 23
## 99 98 23
## 100 99 23
kallisto_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))kallisto_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")kallisto_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()Salmon
sal_tx_remove <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% salmon_loadings$transcript_id) %>%
dplyr::filter(str_detect(seqnames, "^CHR")) %>%
as.data.frame()
salmon_loadings <- salmon_loadings %>%
dplyr::filter(!transcript_id %in% sal_tx_remove$tx_id)
yTx_salmon <- exonsBy(edb, filter = TxIdFilter(salmon_loadings$transcript_id))
yTxSeqs_salmon <- extractTranscriptSeqs(dna, yTx_salmon)
salmon_kmer_df <- yTxSeqs_salmon %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
# Call head to make sure that we get 200 transcripts for each group!
salmon_kmer_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/loadings/salmon/salmon_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/loadings
cp ${loc}/salmon/salmon_seqs.txt ${loc}/salmon/salmon_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/salmon/salmon_mer_counts.jf -C ${loc}/salmon/salmon_seqs.fa
jellyfish histo ${loc}/salmon/salmon_mer_counts.jf > ${loc}/salmon/histos/salmon_${i}_mer_counts_histo.txt
jellyfish stats ${loc}/salmon/salmon_mer_counts.jf > ${loc}/salmon/stats/salmon_${i}_mer_counts_stats.txt
echo "salmon ${i}_mer done"
doneKmer data
salmon_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/salmon/histos"))) {
salmon_mer_add <- read_delim(
paste0(here("data/kmer_analysis/loadings/salmon/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
salmon_mer_hist <- rbind(salmon_mer_hist, salmon_mer_add)
}
salmon_mer_hist <- salmon_mer_hist %>%
mutate(group = "salmon")
write_csv(salmon_mer_hist,
here("data/kmer_analysis/loadings/salmon/salmon_mer_hist.csv"))
salmon_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/salmon/stats"))) {
salmon_stat_add <- read_delim(
paste0(here("data/kmer_analysis/loadings/salmon/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
salmon_mer_stat <- rbind(salmon_mer_stat, salmon_stat_add)
}
salmon_mer_stat <- salmon_mer_stat %>%
mutate(group = "salmon")
write_csv(salmon_mer_stat,
here("data/kmer_analysis/loadings/salmon/salmon_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
salmon_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% salmon_loadings$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "salmon")
salmon_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 153 1813 2955 3732 4681 27019
## . Freq
## 1 0 5
## 2 1 1
## 3 2 1
## 4 3 7
## 5 4 135
## 6 5 431
## 7 6 551
## 8 7 256
## 9 8 118
## 10 9 68
## 11 10 48
## 12 11 36
## 13 12 29
## 14 13 26
## 15 14 25
## 16 15 21
## 17 16 21
## 18 17 18
## 19 18 18
## 20 19 17
## 21 20 17
## 22 21 14
## 23 22 12
## 24 23 12
## 25 24 11
## 26 25 10
## 27 26 10
## 28 27 9
## 29 28 9
## 30 29 8
## 31 30 8
## 32 31 7
## 33 32 7
## 34 33 6
## 35 34 6
## 36 35 6
## 37 36 6
## 38 37 6
## 39 38 6
## 40 39 6
## 41 40 6
## 42 41 6
## 43 42 6
## 44 43 6
## 45 44 5
## 46 45 6
## 47 46 5
## 48 47 6
## 49 48 5
## 50 49 5
## 51 50 5
## 52 51 5
## 53 52 5
## 54 53 5
## 55 54 5
## 56 55 5
## 57 56 5
## 58 57 5
## 59 58 5
## 60 59 5
## 61 60 5
## 62 61 5
## 63 62 5
## 64 63 5
## 65 64 5
## 66 65 5
## 67 66 5
## 68 67 5
## 69 68 5
## 70 69 5
## 71 70 5
## 72 71 5
## 73 72 5
## 74 73 5
## 75 74 5
## 76 75 5
## 77 76 5
## 78 77 5
## 79 78 5
## 80 79 5
## 81 80 5
## 82 81 5
## 83 82 5
## 84 83 5
## 85 84 5
## 86 85 5
## 87 86 5
## 88 87 5
## 89 88 5
## 90 89 5
## 91 90 5
## 92 91 5
## 93 92 5
## 94 93 5
## 95 94 5
## 96 95 5
## 97 96 5
## 98 97 5
## 99 98 5
## 100 99 5
salmon_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))salmon_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")salmon_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()STAR
star_loadings <- star_loadings %>%
dplyr::filter(transcript_id %in% gene_txp_anno$transcript_id)
yTx_sa <- exonsBy(edb, filter = TxIdFilter(star_loadings$transcript_id))
yTxSeqs_sa <- extractTranscriptSeqs(dna, yTx_sa)
star_kmer_df <- yTxSeqs_sa %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
star_kmer_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/loadings/star/star_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/loadings
cp ${loc}/star/star_seqs.txt ${loc}/star/star_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/star/star_mer_counts.jf -C ${loc}/star/star_seqs.fa
jellyfish histo ${loc}/star/star_mer_counts.jf > ${loc}/star/histos/star_${i}_mer_counts_histo.txt
jellyfish stats ${loc}/star/star_mer_counts.jf > ${loc}/star/stats/star_${i}_mer_counts_stats.txt
echo "sa ${i}_mer done"
doneKmer data
star_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/star/histos"))) {
star_mer_add <- read_delim(
paste0(here("data/kmer_analysis/loadings/star/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
star_mer_hist <- rbind(star_mer_hist, star_mer_add)
}
star_mer_hist <- star_mer_hist %>%
mutate(group = "star")
write_csv(star_mer_hist,
here("data/kmer_analysis/loadings/star/star_mer_hist.csv"))
star_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/loadings/star/stats"))) {
star_stat_add <- read_delim(
paste0(here("data/kmer_analysis/loadings/star/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
star_mer_stat <- rbind(star_mer_stat, star_stat_add)
}
star_mer_stat <- star_mer_stat %>%
mutate(group = "star")
write_csv(star_mer_stat,
here("data/kmer_analysis/loadings/star/star_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
star_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% star_loadings$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "star")
star_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 217 1112 2177 3021 3964 20345
## . Freq
## 1 0 5
## 2 1 1
## 3 2 1
## 4 3 8
## 5 4 134
## 6 5 447
## 7 6 538
## 8 7 250
## 9 8 119
## 10 9 66
## 11 10 47
## 12 11 34
## 13 12 29
## 14 13 26
## 15 14 23
## 16 15 20
## 17 16 20
## 18 17 18
## 19 18 18
## 20 19 16
## 21 20 17
## 22 21 14
## 23 22 12
## 24 23 13
## 25 24 12
## 26 25 10
## 27 26 10
## 28 27 9
## 29 28 9
## 30 29 8
## 31 30 8
## 32 31 7
## 33 32 7
## 34 33 6
## 35 34 6
## 36 35 6
## 37 36 6
## 38 37 6
## 39 38 6
## 40 39 6
## 41 40 6
## 42 41 6
## 43 42 6
## 44 43 6
## 45 44 5
## 46 45 6
## 47 46 5
## 48 47 6
## 49 48 5
## 50 49 5
## 51 50 5
## 52 51 5
## 53 52 5
## 54 53 5
## 55 54 5
## 56 55 5
## 57 56 5
## 58 57 5
## 59 58 5
## 60 59 5
## 61 60 5
## 62 61 5
## 63 62 5
## 64 63 5
## 65 64 5
## 66 65 5
## 67 66 5
## 68 67 5
## 69 68 5
## 70 69 5
## 71 70 5
## 72 71 5
## 73 72 5
## 74 73 5
## 75 74 5
## 76 75 5
## 77 76 5
## 78 77 5
## 79 78 5
## 80 79 5
## 81 80 5
## 82 81 5
## 83 82 5
## 84 83 5
## 85 84 5
## 86 85 5
## 87 86 5
## 88 87 5
## 89 88 5
## 90 89 5
## 91 90 5
## 92 91 5
## 93 92 5
## 94 93 5
## 95 94 5
## 96 95 5
## 97 96 5
## 98 97 5
## 99 98 5
## 100 99 5
star_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
ggplot(aes(x = kmer_length, y = Repeats)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7)star_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))star_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")star_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()Plotting
figure 5
lengths_df <- rbind(hisat2_lengths,
kallisto_lengths,
star_lengths,
salmon_lengths)
lengths_by_method <- lengths_df %>%
dplyr::mutate(
group = case_when(
group == "kallisto" ~ "Kallisto",
group == "star" ~ "STAR",
group == "hisat2" ~ "HISAT2",
group == "salmon" ~ "Salmon"
),
group = factor(group, levels = c("Concordant Transcripts",
"Discordant Transcripts",
#"Uncorrelated",
"Top Negative\nPC1 Loadings",
"Top Negative\nPC2 Loadings",
"Top Positive\nPC2 Loadings",
"Top Negative\nPC5 Loadings"))) %>%
ggplot(aes(x = group, y = transcript_length)) +
geom_boxplot(fill = "lightblue") +
scale_y_log10() +
labs(x = "",
y = "Transcript Length (log10)") +
theme_bw()
lengths_by_method %>%
ggsave(filename = "length_by_method_plot.png",
path = here("figures/kmer_analysis/all_methods_plots/"),
device = "png",
height = 200,
width = 200,
units = "mm",
dpi = 400)
txp_lengths_hist <- txp_gene_ensdb_lengths %>%
dplyr::select(tx_id, transcript_length) %>%
mutate(group = "All Transcripts") %>%
rbind(lengths_df %>%
dplyr::select(tx_id, transcript_length) %>%
dplyr::mutate(group = "Top Loadings Transcripts")) %>%
ggplot(aes(x = log10(transcript_length))) +
geom_histogram(bins = 200) +
scale_x_continuous(limits = c(1, 6),
breaks = c(1, 2, 3, 4, 5, 6)) +
labs(x = "Transcript Length (log10)",
y = "Frequency") +
facet_grid(group ~ ., switch = "y", scales = "free") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 12),
axis.title = element_text(colour = "black", size = 14),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 14))
txp_lengths_hist %>%
ggsave(filename = "length_plot.png",
path = here("figures/kmer_analysis/all_methods_plots/"),
device = "png",
height = 150,
width = 200,
units = "mm",
dpi = 400)
# We can see that there is stuff all variation in frequency within groups
stat_df <- rbind(#noncor_mer_stat,
lowcor_mer_stat,
highcor_mer_stat,
hisat2_mer_stat,
kallisto_mer_stat,
star_mer_stat,
salmon_mer_stat)
stat_df <- stat_df %>%
dplyr::mutate(kmer_length = case_when(
kmer_length == 0 ~ 100,
.default = kmer_length
))
group_names <- stat_df$group %>% unique()
new_stat_df <- data.frame()
stat_group_chunk <- data.frame()
for(g in group_names) {
for(i in 1:100) {
stat_kmer_chunk <- stat_df %>%
dplyr::filter(kmer_length == i) %>%
dplyr::filter(group == g) %>%
dplyr::select(Type, Frequency) %>%
distinct() %>%
column_to_rownames("Type") %>%
t() %>%
as_tibble() %>%
mutate(Multiplicity = Total-Distinct) %>%
t() %>%
set_colnames("Frequency") %>%
as.data.frame() %>%
rownames_to_column("Type") %>%
mutate(kmer_length = i, group = g)
stat_group_chunk <- rbind(stat_group_chunk, stat_kmer_chunk)
}
new_stat_df <- rbind(new_stat_df, stat_group_chunk)
}
kmer_stat_plotting <- function(x, type, kmer = 31) {
x %>%
dplyr::filter(kmer_length >= kmer,
Type == type) %>%
dplyr::mutate(
group = case_when(
group == "highcor" ~ "Concordant Transcripts",
group == "lowcor" ~ "Discordant Transcripts",
group == "hisat2" ~ "Top Negative\nPC1 Loadings",
group == "kallisto" ~ "Top Negative\nPC2 Loadings",
group == "star" ~ "Top Positive\nPC2 Loadings",
group == "salmon" ~ "Top Negative\nPC5 Loadings"
),
group = factor(group, levels = c("Concordant Transcripts",
"Discordant Transcripts",
"Top Negative\nPC1 Loadings",
"Top Negative\nPC2 Loadings",
"Top Positive\nPC2 Loadings",
"Top Negative\nPC5 Loadings"))) %>%
ggplot(aes(x = kmer_length, y = Frequency)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency") +
facet_grid(~ group) +
theme_bw() +
theme(strip.background = element_rect(fill = "lightblue"))
}
type_nms <- new_stat_df$Type %>% unique()
for(i in type_nms) {
kmer_stat_plotting(new_stat_df, type = i)
ggsave(filename = paste0(i, "_kmer_length_freq.png"),
path = here("figures/kmer_analysis/all_methods_plots/"),
device = "png",
height = 150,
width = 300,
units = "mm",
dpi = 400)
}
hist_df <- rbind(lowcor_mer_hist,
highcor_mer_hist,
hisat2_mer_hist,
kallisto_mer_hist,
star_mer_hist,
salmon_mer_hist)
hist_df %>%
dplyr::filter(kmer_length == 31) %>%
dplyr::arrange(Repeats) %>%
ggplot(aes(x = Repeats, y = Frequency)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ group) +
theme_bw()kmer_numbers
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "highcor") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :738599
## 1st Qu.:741960
## Median :745325
## Mean :745189
## 3rd Qu.:748582
## Max. :750831
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "lowcor") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :494126
## 1st Qu.:496865
## Median :499443
## Mean :499336
## 3rd Qu.:501892
## Max. :504011
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "hisat2") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :425694
## 1st Qu.:427821
## Median :429744
## Mean :429577
## 3rd Qu.:431438
## Max. :432724
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "kallisto") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :506588
## 1st Qu.:508948
## Median :510892
## Mean :510585
## 3rd Qu.:512473
## Max. :513111
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "salmon") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :429965
## 1st Qu.:431162
## Median :432096
## Mean :431848
## 3rd Qu.:432664
## Max. :432796
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "star") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :417408
## 1st Qu.:418605
## Median :419539
## Mean :419261
## 3rd Qu.:420043
## Max. :420167
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "highcor") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. : 58.0
## 1st Qu.: 109.8
## Median : 161.5
## Mean : 259.6
## 3rd Qu.: 308.2
## Max. :1106.0
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "lowcor") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :24463
## 1st Qu.:24982
## Median :25598
## Mean :25684
## 3rd Qu.:26319
## Max. :27337
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "hisat2") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :41457
## 1st Qu.:42300
## Median :43260
## Mean :43375
## 3rd Qu.:44374
## Max. :45830
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "kallisto") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :38474
## 1st Qu.:39326
## Median :40414
## Mean :40636
## 3rd Qu.:41772
## Max. :43845
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "salmon") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :23804
## 1st Qu.:25163
## Median :26670
## Mean :26832
## 3rd Qu.:28365
## Max. :30676
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "star") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :21210
## 1st Qu.:22569
## Median :24076
## Mean :24247
## 3rd Qu.:25783
## Max. :28149
K-mer Analysis by Unique
HISAT2
hs2_tx_remove <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% hisat2_unique$transcript_id) %>%
dplyr::filter(str_detect(seqnames, "^CHR")) %>%
as.data.frame()
hisat2_unique <- hisat2_unique %>%
dplyr::filter(!transcript_id %in% hs2_tx_remove$tx_id)
yTx_hisat2 <- exonsBy(edb,
filter = TxIdFilter(hisat2_unique$transcript_id))
yTxSeqs_hisat2 <- extractTranscriptSeqs(dna, yTx_hisat2)
hisat2_kmer_df <- yTxSeqs_hisat2 %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
hisat2_kmer_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/unique/hisat2/hisat2_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)#vol=/Volumes/Fast_T7/R_projects/MethodsChapter/data/sequence_analysis
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/unique
cp ${loc}/hisat2/hisat2_seqs.txt ${loc}/hisat2/hisat2_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/hisat2/hisat2_mer_counts.jf -C ${loc}/hisat2/hisat2_seqs.fa
jellyfish histo ${loc}/hisat2/hisat2_mer_counts.jf > ${loc}/hisat2/histos/hisat2_${loc}_mer_counts_histo.txt
#jellyfish dump ${loc}/hisat2/hisat2_mer_counts.jf > ${vol}/hisat2/dumps/hisat2_${i}_mer_counts_dump.fa
jellyfish stats ${loc}/hisat2/hisat2_mer_counts.jf > ${loc}/hisat2/stats/hisat2_${i}_mer_counts_stats.txt
echo "hisat2 ${i}_mer done"
doneKmer data
hisat2_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/hisat2/histos"))) {
hisat2_mer_add <- read_delim(
paste0(here("data/kmer_analysis/unique/hisat2/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
hisat2_mer_hist <- rbind(hisat2_mer_hist, hisat2_mer_add)
}
hisat2_mer_hist <- hisat2_mer_hist %>%
mutate(group = "hisat2")
write_csv(hisat2_mer_hist,
here("data/kmer_analysis/unique/hisat2/hisat2_mer_hist.csv"))
hisat2_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/hisat2/stats"))) {
hisat2_stat_add <- read_delim(
paste0(here("data/kmer_analysis/unique/hisat2/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
hisat2_mer_stat <- rbind(hisat2_mer_stat, hisat2_stat_add)
}
hisat2_mer_stat <- hisat2_mer_stat %>%
mutate(group = "hisat2")
write_csv(hisat2_mer_stat,
here("data/kmer_analysis/unique/hisat2/hisat2_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
hisat2_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% hisat2_unique$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "hisat2")
hisat2_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 539 2062 3372 3669 4908 17657
## . Freq
## 1 0 11
## 2 1 1
## 3 2 1
## 4 3 3
## 5 4 128
## 6 5 467
## 7 6 738
## 8 7 358
## 9 8 177
## 10 9 93
## 11 10 58
## 12 11 40
## 13 12 32
## 14 13 29
## 15 14 27
## 16 15 23
## 17 16 19
## 18 17 20
## 19 18 17
## 20 19 16
## 21 20 16
## 22 21 14
## 23 22 14
## 24 23 15
## 25 24 14
## 26 25 14
## 27 26 14
## 28 27 14
## 29 28 14
## 30 29 14
## 31 30 13
## 32 31 12
## 33 32 12
## 34 33 13
## 35 34 13
## 36 35 12
## 37 36 11
## 38 37 12
## 39 38 12
## 40 39 12
## 41 40 12
## 42 41 12
## 43 42 11
## 44 43 11
## 45 44 11
## 46 45 11
## 47 46 11
## 48 47 11
## 49 48 11
## 50 49 11
## 51 50 11
## 52 51 11
## 53 52 11
## 54 53 11
## 55 54 11
## 56 55 11
## 57 56 11
## 58 57 11
## 59 58 11
## 60 59 11
## 61 60 11
## 62 61 11
## 63 62 11
## 64 63 11
## 65 64 11
## 66 65 11
## 67 66 11
## 68 67 11
## 69 68 11
## 70 69 11
## 71 70 11
## 72 71 11
## 73 72 11
## 74 73 11
## 75 74 11
## 76 75 11
## 77 76 11
## 78 77 11
## 79 78 11
## 80 79 11
## 81 80 11
## 82 81 11
## 83 82 11
## 84 83 11
## 85 84 11
## 86 85 11
## 87 86 11
## 88 87 11
## 89 88 11
## 90 89 11
## 91 90 11
## 92 91 11
## 93 92 11
## 94 93 11
## 95 94 11
## 96 95 11
## 97 96 11
## 98 97 11
## 99 98 11
## 100 99 11
hisat2_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))hisat2_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")hisat2_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()Kallisto
k_tx_remove <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% kallisto_unique$transcript_id) %>%
dplyr::filter(str_detect(seqnames, "^CHR")) %>%
as.data.frame()
kallisto_unique <- kallisto_unique %>%
dplyr::filter(transcript_id %in% gene_txp_anno$transcript_id)
yTx_kallisto <- exonsBy(edb,
filter = TxIdFilter(kallisto_unique$transcript_id))
yTxSeqs_kallisto <- extractTranscriptSeqs(dna, yTx_kallisto)
kallisto_kmer_df <- yTxSeqs_kallisto %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
kallisto_kmer_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/unique/kallisto/kallisto_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)#vol=/Volumes/Fast_T7/R_projects/MethodsChapter/data/sequence_analysis
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/unique
cp ${loc}/kallisto/kallisto_seqs.txt ${loc}/kallisto/kallisto_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/kallisto/kallisto_mer_counts.jf -C ${loc}/kallisto/kallisto_seqs.fa
jellyfish histo ${loc}/kallisto/kallisto_mer_counts.jf > ${loc}/kallisto/histos/kallisto_${i}_mer_counts_histo.txt
#jellyfish dump ${loc}/kallisto/kallisto_mer_counts.jf > ${vol}/kallisto/dumps/kallisto_${i}_mer_counts_dump.fa
jellyfish stats ${loc}/kallisto/kallisto_mer_counts.jf > ${loc}/kallisto/stats/kallisto_${i}_mer_counts_stats.txt
echo "kallisto ${i}_mer done"
doneKmer data
kallisto_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/kallisto/histos"))) {
kallisto_mer_add <- read_delim(
paste0(here("data/kmer_analysis/unique/kallisto/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
kallisto_mer_hist <- rbind(kallisto_mer_hist, kallisto_mer_add)
}
kallisto_mer_hist <- kallisto_mer_hist %>%
mutate(group = "kallisto")
write_csv(kallisto_mer_hist,
here("data/kmer_analysis/unique/kallisto/kallisto_mer_hist.csv"))
kallisto_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/kallisto/stats"))) {
kallisto_stat_add <- read_delim(
paste0(here("data/kmer_analysis/unique/kallisto/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
kallisto_mer_stat <- rbind(kallisto_mer_stat, kallisto_stat_add)
}
kallisto_mer_stat <- kallisto_mer_stat %>%
mutate(group = "kallisto")
write_csv(kallisto_mer_stat,
here("data/kmer_analysis/unique/kallisto/kallisto_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
kallisto_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% kallisto_unique$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "kallisto")
kallisto_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 53.0 564.8 999.5 1601.3 2210.2 14269.0
## . Freq
## 1 0 3
## 2 1 1
## 3 2 2
## 4 3 14
## 5 4 133
## 6 5 421
## 7 6 395
## 8 7 171
## 9 8 80
## 10 9 47
## 11 10 29
## 12 11 21
## 13 12 20
## 14 13 17
## 15 14 16
## 16 15 16
## 17 16 14
## 18 17 14
## 19 18 13
## 20 19 11
## 21 20 10
## 22 21 11
## 23 22 10
## 24 23 11
## 25 24 8
## 26 25 10
## 27 26 8
## 28 27 9
## 29 28 7
## 30 29 9
## 31 30 8
## 32 31 9
## 33 32 7
## 34 33 8
## 35 34 7
## 36 35 8
## 37 36 6
## 38 37 6
## 39 38 5
## 40 39 5
## 41 40 4
## 42 41 4
## 43 42 3
## 44 43 3
## 45 44 3
## 46 45 3
## 47 46 3
## 48 47 3
## 49 48 3
## 50 49 3
## 51 50 3
## 52 51 3
## 53 52 3
## 54 53 3
## 55 54 3
## 56 55 3
## 57 56 3
## 58 57 3
## 59 58 3
## 60 59 3
## 61 60 3
## 62 61 3
## 63 62 3
## 64 63 3
## 65 64 3
## 66 65 3
## 67 66 3
## 68 67 3
## 69 68 3
## 70 69 3
## 71 70 3
## 72 71 3
## 73 72 3
## 74 73 3
## 75 74 3
## 76 75 3
## 77 76 3
## 78 77 3
## 79 78 3
## 80 79 3
## 81 80 3
## 82 81 3
## 83 82 3
## 84 83 3
## 85 84 3
## 86 85 3
## 87 86 3
## 88 87 3
## 89 88 3
## 90 89 3
## 91 90 3
## 92 91 3
## 93 92 3
## 94 93 3
## 95 94 3
## 96 95 3
## 97 96 3
## 98 97 3
## 99 98 3
## 100 99 3
kallisto_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))kallisto_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")kallisto_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()Salmon
sal_tx_remove <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% salmon_unique$transcript_id) %>%
dplyr::filter(str_detect(seqnames, "^CHR")) %>%
as.data.frame()
salmon_unique <- salmon_unique %>%
dplyr::filter(!transcript_id %in% sal_tx_remove$tx_id)
yTx_salmon <- exonsBy(edb, filter = TxIdFilter(salmon_unique$transcript_id))
yTxSeqs_salmon <- extractTranscriptSeqs(dna, yTx_salmon)
salmon_kmer_df <- yTxSeqs_salmon %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
# Call head to make sure that we get 200 transcripts for each group!
salmon_kmer_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/unique/salmon/salmon_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)#vol=/Volumes/Fast_T7/R_projects/MethodsChapter/data/sequence_analysis
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/unique
cp ${loc}/salmon/salmon_seqs.txt ${loc}/salmon/salmon_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/salmon/salmon_mer_counts.jf -C ${loc}/salmon/salmon_seqs.fa
jellyfish histo ${loc}/salmon/salmon_mer_counts.jf > ${loc}/salmon/histos/salmon_${i}_mer_counts_histo.txt
#jellyfish dump ${loc}/salmon/salmon_mer_counts.jf > ${vol}/salmon/dumps/salmon_${i}_mer_counts_dump.fa
jellyfish stats ${loc}/salmon/salmon_mer_counts.jf > ${loc}/salmon/stats/salmon_${i}_mer_counts_stats.txt
echo "salmon ${i}_mer done"
doneKmer data
salmon_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/salmon/histos"))) {
salmon_mer_add <- read_delim(
paste0(here("data/kmer_analysis/unique/salmon/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
salmon_mer_hist <- rbind(salmon_mer_hist, salmon_mer_add)
}
salmon_mer_hist <- salmon_mer_hist %>%
mutate(group = "salmon")
write_csv(salmon_mer_hist,
here("data/kmer_analysis/unique/salmon/salmon_mer_hist.csv"))
salmon_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/salmon/stats"))) {
salmon_stat_add <- read_delim(
paste0(here("data/kmer_analysis/unique/salmon/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
salmon_mer_stat <- rbind(salmon_mer_stat, salmon_stat_add)
}
salmon_mer_stat <- salmon_mer_stat %>%
mutate(group = "salmon")
write_csv(salmon_mer_stat,
here("data/kmer_analysis/unique/salmon/salmon_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
salmon_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% salmon_unique$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "salmon")
salmon_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 66.0 540.0 795.5 1424.0 1871.8 13748.0
## . Freq
## 1 0 2
## 2 1 1
## 3 2 2
## 4 3 22
## 5 4 134
## 6 5 404
## 7 6 352
## 8 7 160
## 9 8 79
## 10 9 53
## 11 10 45
## 12 11 38
## 13 12 36
## 14 13 34
## 15 14 30
## 16 15 28
## 17 16 25
## 18 17 24
## 19 18 22
## 20 19 21
## 21 20 21
## 22 21 20
## 23 22 19
## 24 23 17
## 25 24 15
## 26 25 15
## 27 26 15
## 28 27 12
## 29 28 11
## 30 29 11
## 31 30 10
## 32 31 10
## 33 32 8
## 34 33 8
## 35 34 7
## 36 35 7
## 37 36 6
## 38 37 6
## 39 38 6
## 40 39 6
## 41 40 5
## 42 41 4
## 43 42 4
## 44 43 4
## 45 44 3
## 46 45 3
## 47 46 3
## 48 47 3
## 49 48 3
## 50 49 3
## 51 50 3
## 52 51 3
## 53 52 3
## 54 53 3
## 55 54 3
## 56 55 2
## 57 56 2
## 58 57 2
## 59 58 2
## 60 59 2
## 61 60 2
## 62 61 2
## 63 62 2
## 64 63 2
## 65 64 2
## 66 65 2
## 67 66 2
## 68 67 2
## 69 68 2
## 70 69 2
## 71 70 2
## 72 71 2
## 73 72 2
## 74 73 2
## 75 74 2
## 76 75 2
## 77 76 2
## 78 77 2
## 79 78 2
## 80 79 2
## 81 80 2
## 82 81 2
## 83 82 2
## 84 83 2
## 85 84 2
## 86 85 2
## 87 86 2
## 88 87 2
## 89 88 2
## 90 89 2
## 91 90 2
## 92 91 2
## 93 92 2
## 94 93 2
## 95 94 2
## 96 95 2
## 97 96 2
## 98 97 2
## 99 98 2
## 100 99 2
salmon_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))salmon_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")salmon_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()STAR
star_unique <- star_unique %>%
dplyr::filter(transcript_id %in% gene_txp_anno$transcript_id)
yTx_sa <- exonsBy(edb, filter = TxIdFilter(star_unique$transcript_id))
yTxSeqs_sa <- extractTranscriptSeqs(dna, yTx_sa)
star_kmer_df <- yTxSeqs_sa %>%
as.data.frame() %>%
rownames_to_column("transcript_id") %>%
mutate(transcript_id = paste0(">", transcript_id)) %>%
set_colnames(c("transcript_id", "sequence")) %>%
as_tibble()
star_kmer_df %>%
head(200) %>%
write_delim(here("data/kmer_analysis/unique/star/star_seqs.txt"),
delim = "\n",
eol = "\n\n",
col_names = FALSE)
# star_kmer_df %>%
# head(20) %>%
# write_delim(here("data/kmer_analysis/unique/star_test/star_seqs_test.txt"),
# delim = "\n",
# eol = "\n\n",
# col_names = FALSE)#vol=/Volumes/Fast_T7/R_projects/MethodsChapter/data/sequence_analysis
loc=/Users/konstantinosbogias/Documents/PhD/R_Projects/MethodsChapter/data/kmer_analysis/unique
cp ${loc}/star/star_seqs.txt ${loc}/star/star_seqs.fa
for i in {1..100}; do
jellyfish count -m ${i} -s 100M --output ${loc}/star/star_mer_counts.jf -C ${loc}/star/star_seqs.fa
jellyfish histo ${loc}/star/star_mer_counts.jf > ${loc}/star/histos/star_${i}_mer_counts_histo.txt
#jellyfish dump ${loc}/star/star_mer_counts.jf > ${vol}/star/dumps/star_${i}_mer_counts_dump.fa
jellyfish stats ${loc}/star/star_mer_counts.jf > ${loc}/star/stats/star_${i}_mer_counts_stats.txt
echo "sa ${i}_mer done"
doneKmer data
star_mer_hist <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/star/histos"))) {
star_mer_add <- read_delim(
paste0(here("data/kmer_analysis/unique/star/histos/"), file),
col_names = FALSE,
delim = " ") %>%
set_colnames(c("Repeats", "Frequency")) %>%
mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
star_mer_hist <- rbind(star_mer_hist, star_mer_add)
}
star_mer_hist <- star_mer_hist %>%
mutate(group = "star")
write_csv(star_mer_hist,
here("data/kmer_analysis/unique/star/star_mer_hist.csv"))
star_mer_stat <- data.frame()
for(file in list.files(here("data/kmer_analysis/unique/star/stats"))) {
star_stat_add <- read_delim(
paste0(here("data/kmer_analysis/unique/star/stats/"), file),
col_names = FALSE,
delim = " ") %>%
as.data.frame() %>%
column_to_rownames("X1") %>%
mutate_if(is.character, as.numeric) %>%
mutate(Frequency = coalesce(X2, X3, X4, X5)) %>%
rownames_to_column("Type") %>%
dplyr::select("Type", "Frequency") %>%
mutate(Type = str_remove(Type, ":")) %>%
dplyr::mutate(kmer_length = file %>%
str_extract("...mer") %>%
str_remove("_") %>%
str_remove("_") %>%
str_remove("mer") %>%
as.numeric())
star_mer_stat <- rbind(star_mer_stat, star_stat_add)
}
star_mer_stat <- star_mer_stat %>%
mutate(group = "star")
write_csv(star_mer_stat,
here("data/kmer_analysis/unique/star/star_mer_stat.csv"))Kmer plotting
Wanna try and find long genes with less distinct kmers, so low Distinct and Unique, with a high Total. Possibly a low Max count but a high Repeat number.
star_lengths <- txp_gene_ensdb_lengths %>%
dplyr::filter(tx_id %in% star_unique$transcript_id) %>%
dplyr::select(tx_id, transcript_length, seqnames, strand, tx_biotype) %>%
dplyr::mutate(group = "star")
star_lengths$transcript_length %>% summary()## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 81 506 732 1168 1508 11640
## . Freq
## 1 0 2
## 2 1 1
## 3 2 3
## 4 3 26
## 5 4 133
## 6 5 389
## 7 6 349
## 8 7 150
## 9 8 70
## 10 9 36
## 11 10 24
## 12 11 19
## 13 12 19
## 14 13 17
## 15 14 15
## 16 15 13
## 17 16 13
## 18 17 12
## 19 18 9
## 20 19 8
## 21 20 8
## 22 21 7
## 23 22 7
## 24 23 7
## 25 24 7
## 26 25 6
## 27 26 6
## 28 27 6
## 29 28 6
## 30 29 6
## 31 30 6
## 32 31 5
## 33 32 5
## 34 33 5
## 35 34 4
## 36 35 4
## 37 36 4
## 38 37 4
## 39 38 4
## 40 39 4
## 41 40 4
## 42 41 4
## 43 42 4
## 44 43 4
## 45 44 4
## 46 45 4
## 47 46 3
## 48 47 3
## 49 48 3
## 50 49 3
## 51 50 3
## 52 51 3
## 53 52 3
## 54 53 3
## 55 54 3
## 56 55 3
## 57 56 3
## 58 57 3
## 59 58 3
## 60 59 3
## 61 60 3
## 62 61 3
## 63 62 3
## 64 63 3
## 65 64 3
## 66 65 3
## 67 66 2
## 68 67 2
## 69 68 2
## 70 69 2
## 71 70 2
## 72 71 2
## 73 72 2
## 74 73 2
## 75 74 2
## 76 75 2
## 77 76 2
## 78 77 2
## 79 78 2
## 80 79 2
## 81 80 2
## 82 81 2
## 83 82 2
## 84 83 2
## 85 84 2
## 86 85 2
## 87 86 2
## 88 87 2
## 89 88 2
## 90 89 2
## 91 90 2
## 92 91 2
## 93 92 2
## 94 93 2
## 95 94 2
## 96 95 2
## 97 96 2
## 98 97 2
## 99 98 2
## 100 99 2
star_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
ggplot(aes(x = kmer_length, y = Repeats)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7)star_mer_hist %>%
dplyr::filter(kmer_length >= 31) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ as.character(Repeats),
scales = "free") +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"))star_mer_hist %>%
dplyr::filter(kmer_length <= 20) %>%
dplyr::mutate(Repeats = paste0("Repeat ", as.character(Repeats))) %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = as.character(Repeats))) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
labs(x = "K-mer Length (bp)",
y = "Frequency",
fill = "Kmer") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 10),
axis.title = element_text(colour = "black", size = 12),
strip.text = element_text(colour = "black", size = 12),
strip.background = element_rect(fill = "lightblue"),
legend.position = "none")star_mer_stat %>%
dplyr::filter(!Type == "Max_count") %>%
ggplot(aes(x = kmer_length, y = Frequency, fill = Type)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ Type) +
theme_bw()Plotting
figure 5
lengths_df <- rbind(highcor_lengths,
lowcor_lengths,
hisat2_lengths,
kallisto_lengths,
star_lengths,
salmon_lengths)
lengths_by_method <- lengths_df %>%
dplyr::mutate(
group = case_when(
group == "highcor" ~ "Concordant",
group == "kallisto" ~ "Kallisto",
group == "star" ~ "STAR",
group == "hisat2" ~ "HISAT2",
group == "salmon" ~ "Salmon",
.default = "Discordant"
),
group = factor(group, levels = c("Concordant",
"Discordant",
"HISAT2",
"Kallisto",
"Salmon",
"STAR"))) %>%
ggplot(aes(x = group, y = transcript_length)) +
geom_boxplot(fill = "lightblue") +
scale_y_log10() +
labs(x = "",
y = "Transcript Length (log10)") +
theme_bw()
lengths_by_method %>%
ggsave(filename = "length_by_method_plot.png",
path = here("figures/kmer_analysis/unique/all_methods_plots/"),
device = "png",
height = 200,
width = 200,
units = "mm",
dpi = 400)
txp_lengths_hist <- txp_gene_ensdb_lengths %>%
dplyr::select(tx_id, transcript_length) %>%
mutate(group = "All Transcripts") %>%
rbind(lengths_df %>%
dplyr::filter(group == "lowcor") %>%
dplyr::select(tx_id, transcript_length) %>%
dplyr::mutate(group = "Discordant Transcripts")) %>%
ggplot(aes(x = log10(transcript_length))) +
geom_histogram(bins = 200) +
scale_x_continuous(limits = c(1, 6),
breaks = c(1, 2, 3, 4, 5, 6)) +
labs(x = "Transcript Length (log10)",
y = "Frequency") +
facet_grid(group ~ ., switch = "y", scales = "free") +
theme_bw() +
theme(axis.text = element_text(colour = "black", size = 12),
axis.title = element_text(colour = "black", size = 14),
strip.background = element_rect(fill = "lightblue"),
strip.text = element_text(colour = "black", size = 14))
txp_lengths_hist %>%
ggsave(filename = "discor_length_plot.png",
path = here("figures/kmer_analysis/unique/all_methods_plots/"),
device = "png",
height = 150,
width = 200,
units = "mm",
dpi = 400)
# We can see that there is stuff all variation in frequency within groups
stat_df <- rbind(highcor_mer_stat,
lowcor_mer_stat,
hisat2_mer_stat,
kallisto_mer_stat,
star_mer_stat,
salmon_mer_stat)
stat_df <- stat_df %>%
dplyr::mutate(kmer_length = case_when(
kmer_length == 0 ~ 100,
.default = kmer_length
))
group_names <- stat_df$group %>% unique()
new_stat_df <- data.frame()
stat_group_chunk <- data.frame()
for(g in group_names) {
for(i in 1:100) {
stat_kmer_chunk <- stat_df %>%
dplyr::filter(kmer_length == i) %>%
dplyr::filter(group == g) %>%
dplyr::select(Type, Frequency) %>%
distinct() %>%
column_to_rownames("Type") %>%
t() %>%
as_tibble() %>%
mutate(Multiplicity = Total-Distinct,
Percent_Multi = Multiplicity/Total,
Percent_Unique = Unique/Total) %>%
t() %>%
set_colnames("Frequency") %>%
as.data.frame() %>%
rownames_to_column("Type") %>%
mutate(kmer_length = i, group = g)
stat_group_chunk <- rbind(stat_group_chunk, stat_kmer_chunk)
}
new_stat_df <- rbind(new_stat_df, stat_group_chunk)
}
kmer_stat_plotting <- function(x, type, kmer = 31) {
x %>%
dplyr::filter(kmer_length >= kmer,
Type == type) %>%
dplyr::mutate(
group = case_when(
group == "highcor" ~ "Concordant Transcripts",
group == "lowcor" ~ "Discordant Transcripts",
group == "hisat2" ~ "HISAT2",
group == "kallisto" ~ "Kallisto",
group == "star" ~ "STAR",
group == "salmon" ~ "Salmon"
),
group = factor(group, levels = c("Concordant Transcripts",
"Discordant Transcripts",
"HISAT2",
"Kallisto",
"STAR",
"Salmon"))) %>%
ggplot(aes(x = kmer_length, y = Frequency)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
scale_y_log10() +
labs(x = "K-mer Length (bp)",
y = "Frequency") +
facet_grid(~ group) +
theme_bw() +
theme(strip.background = element_rect(fill = "lightblue"))
}
type_nms <- new_stat_df$Type %>% unique()
for(i in type_nms) {
kmer_stat_plotting(new_stat_df, type = i)
ggsave(filename = paste0(i, "_kmer_length_freq.png"),
path = here("figures/kmer_analysis/unique/all_methods_plots_log10/"),
device = "png",
height = 150,
width = 300,
units = "mm",
dpi = 400)
}
hist_df <- rbind(lowcor_mer_hist,
highcor_mer_hist,
hisat2_mer_hist,
kallisto_mer_hist,
star_mer_hist,
salmon_mer_hist)
hist_df %>%
dplyr::filter(kmer_length == 31) %>%
dplyr::arrange(Repeats) %>%
ggplot(aes(x = Repeats, y = Frequency)) +
geom_area(fill = "royalblue",
colour = "darkblue",
linewidth = 1,
alpha = 0.7) +
facet_grid(~ group) +
theme_bw()Lollipop plot for kmers
percent_lolpop <- new_stat_df %>%
dplyr::filter(Type %in% c("Percent_Multi", "Percent_Unique")) %>%
dplyr::mutate(Percent = Frequency*100) %>%
dplyr::filter(kmer_length == 31) %>%
dplyr::mutate(group = case_when(
group == "highcor" ~ "Concordant",
group == "lowcor" ~ "Discordant",
group == "hisat2" ~ "HISAT2",
group == "kallisto" ~ "Kallisto",
group == "salmon" ~ "Salmon",
group == "star" ~ "STAR"
),
Type = case_when(Type == "Percent_Multi" ~ "K-mer Repeats (%)",
.default = "Unique K-mers (%)"),
group = factor(group, levels = rev(c("Concordant", "Discordant",
"HISAT2", "Kallisto",
"Salmon", "STAR")))) %>%
ggplot(aes(x = Percent, y = group)) +
geom_point() +
geom_segment(aes(x = 0, xend = Percent, y = group, yend = group),
colour = "skyblue", linewidth = 1) +
geom_point(colour = "skyblue", size = 6) +
geom_point(colour = "royalblue", size = 4) +
geom_point(colour = "darkblue", size = 2) +
labs(x = "Percentage (%)") +
facet_wrap(~ Type) +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 12),
axis.text = element_text(colour = "black", size = 10),
strip.text = element_text(colour = "black", size = 11),
strip.background = element_rect(fill = "lightblue"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank())
percent_lolpop %>%
ggsave(filename = "percent_lollipop.png",
path = here("figures/kmer_analysis/unique/lollipops/"),
device = "png",
height = 150,
width = 300,
units = "mm",
dpi = 400)multi_lolpop <- new_stat_df %>%
dplyr::filter(Type == "Multiplicity") %>%
dplyr::mutate(Percent = Frequency*100) %>%
dplyr::filter(kmer_length == 31) %>%
dplyr::mutate(group = case_when(
group == "highcor" ~ "Concordant",
group == "lowcor" ~ "Discordant",
group == "hisat2" ~ "HISAT2",
group == "kallisto" ~ "Kallisto",
group == "salmon" ~ "Salmon",
group == "star" ~ "STAR"
),
group = factor(group, levels = rev(c("Concordant", "Discordant",
"HISAT2", "Kallisto",
"Salmon", "STAR")))) %>%
ggplot(aes(x = Frequency, y = group)) +
geom_segment(aes(x = 0, xend = Frequency, y = group, yend = group),
colour = "skyblue", linewidth = 1) +
geom_point(colour = "skyblue", size = 6) +
geom_point(colour = "royalblue", size = 4) +
geom_point(colour = "darkblue", size = 2) +
labs(x = "K-mer Repeat Frequency") +
scale_x_log10() +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 12),
axis.text = element_text(colour = "black", size = 10),
axis.text.y = element_text(face = "bold"),
strip.text = element_text(colour = "black", size = 11),
strip.background = element_rect(fill = "lightblue"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank())
multi_lolpop %>%
ggsave(filename = "multi_lollipop.png",
path = here("figures/kmer_analysis/unique/lollipops/"),
device = "png",
height = 200,
width = 250,
units = "mm",
dpi = 400)uniq_dis_lolpop <- new_stat_df %>%
dplyr::filter(kmer_length == 31 & Type %in% c("Unique",
"Distinct",
"Total")) %>%
dplyr::mutate(group = case_when(
group == "highcor" ~ "Concordant",
group == "lowcor" ~ "Discordant",
group == "hisat2" ~ "HISAT2",
group == "kallisto" ~ "Kallisto",
group == "salmon" ~ "Salmon",
group == "star" ~ "STAR"
),
group = factor(group, levels = rev(c("Concordant", "Discordant",
"HISAT2", "Kallisto",
"Salmon", "STAR"))),
Type = factor(Type,
levels = c("Unique", "Distinct", "Total"))) %>%
distinct() %>%
ggplot(aes(x = Frequency, y = group)) +
geom_segment(aes(x = 0, xend = Frequency,
y = group, yend = group),
colour = "skyblue", linewidth = 1) +
geom_point(colour = "skyblue", size = 6) +
geom_point(colour = "royalblue", size = 4) +
geom_point(colour = "darkblue", size = 2) +
scale_x_log10() +
scale_x_continuous(limits = c(0, 8e5)) +
facet_wrap(~ Type, scales = "fixed") +
theme_bw() +
theme(axis.title = element_text(colour = "black", size = 12),
axis.text = element_text(colour = "black", size = 10),
strip.text = element_text(colour = "black", size = 11),
strip.background = element_rect(fill = "lightblue"),
axis.title.y = element_blank(),
axis.ticks.y = element_blank())
uniq_dis_lolpop %>%
ggsave(filename = "uniq_dis_lollipop.png",
path = here("figures/kmer_analysis/unique/lollipops/"),
device = "png",
height = 200,
width = 500,
units = "mm",
dpi = 400)kmer_numbers
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "highcor") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :738599
## 1st Qu.:741960
## Median :745325
## Mean :745189
## 3rd Qu.:748582
## Max. :750831
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "lowcor") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :494126
## 1st Qu.:496865
## Median :499443
## Mean :499336
## 3rd Qu.:501892
## Max. :504011
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "hisat2") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :480268
## 1st Qu.:481822
## Median :483368
## Mean :483303
## 3rd Qu.:484817
## Max. :486058
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "kallisto") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :282407
## 1st Qu.:285575
## Median :288716
## Mean :288627
## 3rd Qu.:291755
## Max. :294368
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "salmon") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :238655
## 1st Qu.:241830
## Median :245008
## Mean :244930
## 3rd Qu.:248152
## Max. :250567
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Unique", kmer_length > 30) %>%
dplyr:: filter(group == "star") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :230823
## 1st Qu.:233704
## Median :236542
## Mean :236315
## 3rd Qu.:239023
## Max. :240967
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "highcor") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. : 58.0
## 1st Qu.: 109.8
## Median : 161.5
## Mean : 259.6
## 3rd Qu.: 308.2
## Max. :1106.0
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "lowcor") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :24463
## 1st Qu.:24982
## Median :25598
## Mean :25684
## 3rd Qu.:26319
## Max. :27337
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "hisat2") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :162635
## 1st Qu.:163910
## Median :165190
## Mean :165226
## 3rd Qu.:166523
## Max. :167974
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "kallisto") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :4686
## 1st Qu.:4824
## Median :4978
## Mean :5028
## 3rd Qu.:5188
## Max. :5652
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "salmon") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. : 9374
## 1st Qu.: 9491
## Median : 9627
## Mean : 9685
## 3rd Qu.: 9789
## Max. :10491
new_stat_df %>%
distinct() %>%
dplyr::filter(Type == "Multiplicity", kmer_length > 30) %>%
dplyr:: filter(group == "star") %>%
dplyr::select("Frequency") %>%
summary()## Frequency
## Min. :3599
## 1st Qu.:3875
## Median :4181
## Mean :4306
## 3rd Qu.:4678
## Max. :5494